関数定義ほか

lagdiff <- function(n) {                      # 前日差を求める関数
  n - dplyr::lag(n, default = 0L)
}

ma7 <- function(n) {                          # 移動平均(7日)を求める関数
  zoo::rollmeanr(n, k = 7L, na.pad = TRUE)
}

ma28 <- function(n) {                         # 移動平均(28日)を求める関数
  zoo::rollmeanr(n, k = 28L, na.pad = TRUE)
}

daily_aggregate <- function(df, date, key) {  # 日時集計を行う関数
  date <- dplyr::enquo(date)
  key <- dplyr::enquo(key)
  
  df %>% 
    dplyr::group_by(!!date, !!key) %>% 
    dplyr::summarise(n = dplyr::n()) %>% 
    dplyr::ungroup() %>% 
    tidyr::complete(
      date = seq.Date(from = min(!!date), to = max(!!date), by = "day"),
      !!key, fill = list(n = 0L)
    ) %>% 
    dplyr::group_by(!!key) %>% 
    tidyr::nest() %>%
    dplyr::mutate(
      diff = purrr::map(data, ~ lagdiff(.$n)),   # 前日差
      cum = purrr::map(data, ~ cumsum(.$n)),     # 累計
      ma7 = purrr::map(data, ~ ma7(.$n)),        # 移動平均(7日)
      ma28 = purrr::map(data, ~ ma28(.$n))       # 移動平均(28日)
    ) %>% 
    tidyr::unnest(cols = c(data, diff, cum, ma7, ma28)) %>% 
    return()
}

subtitle <- paste0("Generated @", lubridate::now())
caption <- "Data Source: covid19japan.com"

Import & Tidy

個票データの集計に限らず、データをインポートした際には目的に応じて各変量(変数)の変数型を変更します。特に水準ごとに層別処理を行いたい場合には因子型に変換しておくと便利です。また、結合したいデータと名称や体系を合わせておくこともポイントです。

 

都道府県データ

prefs <- "https://gist.githubusercontent.com/k-metrics/9f3fc18e042850ff24ad9676ac34764b/raw/f4ea87f429e1ca28627feff94b67c8b2432aee59/pref_utf8.csv" %>% 
  readr::read_csv() %>% 
  dplyr::mutate(
    # Googleの予測データと結合するためにコード体系を合わせる
    japan_prefecture_code = paste0("JP-", `コード`)
  ) %>% 

  dplyr::select(
    # Googleの予測データと結合するために名称を変更する
    japan_prefecture_code, prefecture_name = pref,
    # 日本語の変数名は扱いにくいので英語名に変更する
    pref = `都道府県`, region = `八地方区分`, pops = `推計人口`
  ) %>% 
  dplyr::mutate(
    # 水準ごとに表示させるために因子化する(あらかじめデータをコード順に並べて
    # おくことが因子化の際のポイントのひとつ)
    japan_prefecture_code = forcats::fct_inorder(japan_prefecture_code),
    pref = forcats::fct_inorder(pref),
    region = forcats::fct_inorder(region),
    pops = as.integer(pops)
  )

prefs

 

Covid19Japan個票データ

df <- "https://raw.githubusercontent.com/reustle/covid19japan-data/master/docs/patient_data/latest.json" %>% 
  jsonlite::fromJSON() %>% 
  dplyr::select(patientId, date = dateAnnounced, gender,
                pref = detectedPrefecture, patientStatus, knownCluster,
                confirmedPatient, ageBracket,
                deceasedDate, deceasedReportedDate) %>% 
  dplyr::filter(confirmedPatient == TRUE) %>% 
  dplyr::mutate(
    date = lubridate::as_date(date),
    gender = forcats::as_factor(gender),
    pref = stringr::str_to_lower(pref),
    patientStatus = forcats::as_factor(patientStatus),
    cluster = dplyr::if_else(!is.na(knownCluster), TRUE, FALSE),
    ageBracket = forcats::as_factor(ageBracket),
    deceasedDate = lubridate::as_date(deceasedDate),
    deceasedReportedDate = lubridate::as_date(deceasedReportedDate)
  ) %>% 
  # 都道府県データと結合
  dplyr::left_join(prefs, by = c("pref" = "prefecture_name")) %>% 
  dplyr::select(-pref) %>% 
  dplyr::rename(pref = pref.y) %>% 
  # 因子型の欠損値を水準化する
  dplyr::mutate(
    japan_prefecture_code = forcats::fct_explicit_na(japan_prefecture_code,
                                                     na_level = "JP-99"),
    pref = forcats::fct_explicit_na(pref, na_level = "空港検疫"),
    region = forcats::fct_explicit_na(region, na_level = "空港検疫"),
    gender = forcats::fct_explicit_na(gender, na_level = "非公表"),
    ageBracket = forcats::fct_explicit_na(ageBracket, na_level = "非公表"),
    patientStatus = forcats::fct_explicit_na(patientStatus,
                                             na_level = "Unknown")
  ) %>% 
  dplyr::filter(date < lubridate::today())

df
df %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 368645
Number of columns 14
_______________________
Column type frequency:
character 2
Date 3
factor 6
logical 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
patientId 0 1.00 1 16 0 368645 0
knownCluster 366140 0.01 3 88 0 233 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2020-01-15 2021-01-25 2020-12-15 363
deceasedDate 368265 0 2020-02-13 2020-11-19 2020-05-08 151
deceasedReportedDate 368315 0 2020-02-13 2020-10-17 2020-05-16 131

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 3 非公表: 258973, M: 61370, F: 48302
patientStatus 0 1 FALSE 9 Unk: 366111, Hos: 1261, Dec: 372, Hom: 315
ageBracket 0 1 FALSE 13 非公表: 259070, 20: 29433, 30: 19042, 40: 16089
japan_prefecture_code 0 1 FALSE 48 JP-: 94576, JP-: 41728, JP-: 38429, JP-: 23628
pref 0 1 FALSE 48 東京都: 94576, 大阪府: 41728, 神奈川: 38429, 埼玉県: 23628
region 0 1 FALSE 9 関東地: 188800, 近畿地: 72944, 中部地: 37817, 九州地: 32196

Variable type: logical

skim_variable n_missing complete_rate mean count
confirmedPatient 0 1 1.00 TRU: 368645
cluster 0 1 0.01 FAL: 366140, TRU: 2505

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
pops 2142 0.99 7924.54 4235.09 560 5107 7537 13822 13822 ▆▅▆▇▇

 

Data Wrangling

 

陽性者判定者数

 

全国集計

japan_daily <- df %>% 
  dplyr::group_by(date) %>% 
  dplyr::summarise(n = dplyr::n()) %>% 
  dplyr::ungroup() %>% 
  tidyr::complete(
    date = seq.Date(from = min(date), to = max(date), by = "day"),
    fill = list(n = 0L)
  ) %>% 
  dplyr::mutate(
    diff = lagdiff(n),   # 前日差
    cum = cumsum(n),     # 累計
    ma7 = ma7(n),        # 移動平均(7日)
    ma28 = ma28(n)       # 移動平均(28日)
  )

japan_daily

 

地方区分別集計

region_daily <- df %>% 
  dplyr::group_by(date, region) %>% 
  dplyr::summarise(n = dplyr::n()) %>% 
  dplyr::ungroup() %>%                         # この処理がポイント
  tidyr::complete(
    date = seq.Date(from = min(date), to = max(date), by = "day"),
    region, fill = list(n = 0L)
  ) %>% 
  print() %>%                                  # 途中結果の表示
  dplyr::group_by(region) %>% 
  tidyr::nest() %>%
  print() %>%                                  # 途中結果の表示
  # 組み合わせグルーピングの場合は purrr パッケージで処理するのが速い
  dplyr::mutate(
    diff = purrr::map(data, ~ lagdiff(.$n)),   # 前日差
    cum = purrr::map(data, ~ cumsum(.$n)),     # 累計
    ma7 = purrr::map(data, ~ ma7(.$n)),        # 移動平均(7日)
    ma28 = purrr::map(data, ~ ma28(.$n))       # 移動平均(28日)
  ) %>% 
  tidyr::unnest(cols = c(data, diff, cum, ma7, ma28))

region_daily
region_daily <- df %>% 
  daily_aggregate(date, region)   # 上記の集計処理を関数化したもの

region_daily

 

都道府県別集計

pref_daily <- df %>% 
  daily_aggregate(date, pref)

pref_daily

 

年代別集計

 

全国集計

ageBracket_daily <- df %>% 
  daily_aggregate(date, ageBracket)

ageBracket_daily

 

クラスター別集計

 

全国集計

cluster_daily <- df %>% 
  daily_aggregate(date, cluster)

cluster_daily

 

Visualize

陽性判定者数

全国集計

subset <- japan_daily
title <- "【全国】陽性者数(単日)"
xlab <- ""
ylab <- ""
sec_scale <- 50

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date)) + 
    ggplot2::geom_bar(ggplot2::aes(y = n), stat = "identity", width = 1.0,
                      fill = "dark gray", alpha = 1.0) + 
    ggplot2::geom_line(ggplot2::aes(y = ma7), linetype = "solid",
                       colour = "gray10", size = 0.5) + 
    ggplot2::geom_line(ggplot2::aes(y = cum / sec_scale),
                       colour = "dark green", size = 1.0) +
    ggplot2::scale_y_continuous(
      name = "陽性者数(灰)・移動平均(黒)",
      sec.axis = ggplot2::sec_axis(~ . * sec_scale,
                                   name = "累積陽性者数(濃緑)")) + 
    ggplot2::theme(axis.text.y.left = ggplot2::element_text(colour = "gray10"),
                   axis.line.y.left = ggplot2::element_line(colour = "gray10"),
                   axis.text.y.right = ggplot2::element_text(colour = "dark green"),
                   axis.line.y.right = ggplot2::element_line(colour = "dark green")) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab)

 

同前日差

subset <- japan_daily %>% dplyr::mutate(y = diff)
title <- "【全国】前日差(単日)"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date, y = y)) + 
    ggplot2::geom_line(colour = "dark green", alpha = 0.5) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab)

 

地方区分別集計

subset <- region_daily %>% dplyr::mutate(key = region, y = n)
title <- "【地方別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date, y = y)) + 
    ggplot2::geom_bar(ggplot2::aes(fill = key), stat = "identity",
                      width = 1.0, alpha = 0.5) + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption, 
                  x = xlab, y = ylab)

 

subset <- region_daily %>% dplyr::mutate(key = region, y = n)
title <- "【地方別】単日"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 0.5) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab) 

 

同累計

subset <- region_daily %>% dplyr::mutate(key = region, y = cum)
title <- "【地方別】累計"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
    ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 1) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab) 

 

同移動平均(7日)

subset <- region_daily %>% dplyr::mutate(key = region, y = ma7)
title <- "【地方別】移動平均(7日)"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
    ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 1) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab) 

 

同移動平均(28日)

subset <- region_daily %>% dplyr::mutate(key = region, y = ma28)
title <- "【地方別】移動平均(28日)"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
    ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 1) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab) 

 

同前日差

subset <- region_daily %>% dplyr::mutate(key = region, y = diff)
title <- "【地方別】前日差(単日)"
xlab <- ""
ylab <- "陽性者数"
ncol <- 3

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(alpha = 0.75) +
    ggplot2::theme(legend.position = 'none') + 
    ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y") + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab)

 

都道府県別

subset <- pref_daily %>% dplyr::mutate(key = pref, y = n)
title <- "【都道府県別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
    ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 0.5) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab) 

 

subset <- pref_daily %>% dplyr::mutate(key = pref)
title <- "【都道府県別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"
sec_scale <- 50
ncol <- 3

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date)) + 
    ggplot2::geom_bar(ggplot2::aes(y = n, fill = key), stat = "identity",
                      alpha = 0.25, width = 1.0) + 
    ggplot2::geom_line(ggplot2::aes(y = ma7, colour = key),
                       linetype = "solid", size = 0.25) + 
    ggplot2::geom_line(ggplot2::aes(y = cum / sec_scale, colour = key)) +
    ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y") + 
    ggplot2::theme(legend.position = 'none') + 
    ggplot2::scale_y_continuous(
      name = "陽性者数(棒)・移動平均(細線)",
      sec.axis = ggplot2::sec_axis(~ . * sec_scale,
                                    name = "陽性者数累計(太線)")) +
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab)

 

同前日差

subset <- pref_daily %>% dplyr::mutate(key = pref, y = diff)
title <- "【都道府県別】前日差(単日)"
xlab <- ""
ylab <- "陽性者数"
ncol <- 3

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(alpha = 0.75) +
    ggplot2::theme(legend.position = 'none') + 
    ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y") + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab)

 

年代別

subset <- ageBracket_daily %>% dplyr::mutate(key = ageBracket, y = n)
title <- "【年代別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"

subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date, y = y)) + 
    ggplot2::geom_bar(ggplot2::aes(fill = key), stat = "identity",
                      width = 1.0, alpha = 0.5) + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption, 
                  x = xlab, y = ylab)

 

クラスター別

subset <- cluster_daily %>% dplyr::mutate(key = cluster, y = n)
title <- "【クラスター別】陽性者数(単日)"
xlab <- ""
ylab <- "陽性者数"


subset %>% 
  ggplot2::ggplot(ggplot2::aes(x = date, y = y)) + 
    ggplot2::geom_bar(ggplot2::aes(fill = key), stat = "identity",
                      width = 1.0, alpha = 0.5) + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption, 
                  x = xlab, y = ylab)

 

対数化

地方区分(累計)

subset <- region_daily %>% dplyr::mutate(key = region, y = cum)
title <- "【地方別】累計"
xlab <- ""
ylab <- "陽性者数(常用対数)"

subset %>% 
    ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 1) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::scale_y_log10() + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab) 

 

同移動平均(7日)

subset <- region_daily %>% dplyr::mutate(key = region, y = ma7)
title <- "【地方別】移動平均(7日)"
xlab <- ""
ylab <- "陽性者数(常用対数)"

subset %>% 
    ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 1) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::scale_y_log10() + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab)

 

同移動平均(28日)

subset <- region_daily %>% dplyr::mutate(key = region, y = ma28)
title <- "【地方別】移動平均(28日)"
xlab <- ""
ylab <- "陽性者数(常用対数)"

subset %>% 
    ggplot2::ggplot(ggplot2::aes(x = date, y = y, colour = key)) + 
    ggplot2::geom_line(size = 1) +
    ggplot2::theme(legend.position = 'none') +
    ggrepel::geom_text_repel(ggplot2::aes(label = key),
                             data = subset(subset, date == max(date)),
                             nudge_x = 30, segment.alpha = 0.5, size = 4) + 
    ggplot2::lims(x = c(min(subset$date),
                        max(subset$date) + 45)) +
    ggplot2::scale_y_log10() + 
    ggplot2::labs(title = title, subtitle = subtitle, caption = caption,
                  x = xlab, y = ylab)

 


CC 4.0 BY-NC-SA, Sampo Suzuki